In [ ]:
from __future__ import print_function, unicode_literals
from dbpedia_utils import iter_entities_from
from collections import defaultdict, Counter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import gensim
import json
import gzip
import nltk
import dbpedia_config
In [ ]:
source_folder = dbpedia_config.DATA_FOLDER
target_folder = dbpedia_config.TARGET_FOLDER
abstracts_file = '{0}/long_abstracts_{1}.nt.bz2'.format(source_folder, dbpedia_config.MAIN_LANGUAGE)
text_language = 'english'
First, we load the person data, so that we process only the biographies present in our dataset.
In [ ]:
person_data = pd.read_csv('{0}/person_data_en.csv.gz'.format(target_folder), encoding='utf-8', index_col='uri')
person_data.head()
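As a quick sanity check, we can look at how many biographies we have per gender (this uses the gender column that we rely on later in this notebook):
In [ ]:
# exploratory: distribution of biographies per gender value.
person_data['gender'].value_counts()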
Here we read the biography overviews to train our gensim collocations model. Note that you need NLTK to split the abstracts into sentences; if you have not used it before, you may need to download the punkt tokenizer models first (nltk.download('punkt')).
In [ ]:
def sentences():
    # yield each abstract sentence as a list of lowercased, de-accented tokens.
    for i, entity in enumerate(iter_entities_from(abstracts_file)):
        resource = entity['resource']
        if resource in person_data.index:
            try:
                abstract = entity['abstract'].pop()
                if abstract:
                    for sentence in nltk.sent_tokenize(abstract, language=text_language):
                        yield list(gensim.utils.tokenize(sentence, deacc=True, lowercase=True))
            except KeyError:
                # some entities do not have an abstract.
                continue

bigrams = gensim.models.Phrases(sentences())
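To see what the model learned, we can feed it a tokenized sentence; tokens that form a detected collocation come back joined by an underscore. This is only an illustrative sketch: the actual collocations depend on the training corpus.
In [ ]:
# illustrative: a phrase such as 'united states' would come back as
# 'united_states' if that collocation was detected during training.
example = list(gensim.utils.tokenize('She was born in the United States', deacc=True, lowercase=True))
print(bigrams[example])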
In [ ]:
bigrams.save('{0}/biography_overviews_bigrams.gensim'.format(target_folder))
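Since Phrases inherits gensim's standard save/load behavior, the trained model can be restored in a later session:
In [ ]:
# a sketch: reload the collocations model saved above.
bigrams = gensim.models.Phrases.load('{0}/biography_overviews_bigrams.gensim'.format(target_folder))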
Now that we have trained our model, we can identify bigrams in biographies. With it, we will build a vocabulary dictionary with the following structure:
{gender => {word => # of biographies containing the word}}
In [ ]:
vocabulary = defaultdict(Counter)

for i, entity in enumerate(iter_entities_from(abstracts_file)):
    resource = entity['resource']
    if resource in person_data.index:
        try:
            abstract = entity['abstract'].pop()
            if not abstract:
                # some biographies have an empty abstract.
                continue
            gender = person_data.loc[resource].gender
            # count each token at most once per biography, so that the counts
            # reflect the number of biographies in which a token appears.
            biography_tokens = set()
            for sentence in nltk.sent_tokenize(abstract, language=text_language):
                n_grams = bigrams[list(gensim.utils.tokenize(sentence, deacc=True, lowercase=True))]
                biography_tokens.update(n_grams)
            vocabulary[gender].update(biography_tokens)
        except KeyError:
            # some biographies do not have an abstract.
            continue
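Since each per-gender entry is a collections.Counter, we can already peek at the most frequent tokens per gender before saving:
In [ ]:
# a quick (illustrative) inspection of the most common tokens per gender.
for gender, counts in vocabulary.items():
    print(gender, counts.most_common(10))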
Finally, we save the vocabulary in a format that the following notebooks can reuse.
In [ ]:
# text mode ('wt') keeps json.dump working on both Python 2 and 3;
# with 'wb', json.dump fails on Python 3 because it writes str, not bytes.
with gzip.open('{0}/vocabulary.json.gz'.format(target_folder), 'wt') as f:
    json.dump(vocabulary, f)
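For reference, a following notebook can load the vocabulary back like this (a sketch: json serializes the defaultdict/Counter structure as plain nested dicts, so we rebuild the Counters after loading):
In [ ]:
# a sketch of reloading in a later notebook; json returns plain dicts.
with gzip.open('{0}/vocabulary.json.gz'.format(target_folder), 'rt') as f:
    vocabulary = {gender: Counter(words) for gender, words in json.load(f).items()}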